package com.abigdreamer.wordpress.ui;

import java.io.File;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.abigdreamer.wordpress.newspider.FileUtil;
import com.zving.framework.utility.StringUtil;

/**
 * Extracts the first post body from a saved HTML page and rewrites the
 * images it references so that they point at locally stored copies.
 *
 * <p>The page is expected to contain a {@code div} with id {@code postlist};
 * inside it, post bodies are looked up as {@code td[class=t_f]} cells nested
 * in {@code div[class=pcb]} containers.
 *
 * @author Darkness
 * @date 2013-06-29 20:18:28
 * @version V1.0
 */
public class WordpressArticleExtractor {

	/** Sentinel returned by {@link #extract(File)} when no post content is found. */
	private static final String NOT_FOUND = "未找到内容";

	/** Prefix prepended to image URLs that do not already start with {@code http}. */
	private static final String SITE_ROOT = "http://hkbici.com/";

	/** Web-relative directory prefix; the source file's base name is appended to it. */
	private static final String IMAGE_WEB_ROOT = "/gather/dixcus/images/";

	/** Local directory prepended to the web-relative image path when saving files. */
	private static final String WEBAPPS_DIR = "D:/Java/apache-tomcat-6.0.32/webapps";

	/**
	 * Reads {@code file}, locates the post content and returns the inner HTML
	 * of the first content cell.
	 *
	 * <p>Every {@code img} found in the matched content cells is handed to
	 * {@code HttpImageGetter.file_put_contents} together with a target path
	 * under {@link #WEBAPPS_DIR} (presumably to download and store it), and
	 * its {@code src} attribute is rewritten to the corresponding
	 * web-relative path. Note that images of all matched cells are processed,
	 * while only the first cell's HTML is returned.
	 *
	 * @param file the saved HTML page; its name with {@code ".html"} removed
	 *             is used as the image sub-directory
	 * @return the inner HTML of the first content cell, or
	 *         {@code "未找到内容"} when the page has no post list or no
	 *         content cell
	 */
	public static String extract(File file) {
		String html = FileUtil.readText(file);
		Document doc = Jsoup.parseBodyFragment(html);

		Elements postList = doc.select("div[id=postlist]").select("div");
		if (postList.isEmpty()) {
			return NOT_FOUND;
		}

		Elements contentEl = postList.get(0).select("div[class=pcb]").select("td[class=t_f]");
		if (contentEl.isEmpty()) {
			// Previously this fell through to contentEl.get(0) and threw
			// IndexOutOfBoundsException instead of reporting "not found".
			return NOT_FOUND;
		}

		String path = IMAGE_WEB_ROOT + file.getName().replace(".html", "");

		for (Element img : contentEl.select("img")) {
			String imageSrc = resolveImageUrl(img);

			if (SITE_ROOT.equals(imageSrc)) {
				// No usable source attribute at all: log the tag itself
				// (html() of an <img> is always empty) and leave it untouched.
				System.out.println(img.outerHtml());
				continue;
			}

			// The last path segment of the URL is used as the local file name.
			String[] segments = imageSrc.split("/");
			String fileName = segments[segments.length - 1];

			System.out.println("抓取图片:" + imageSrc);
			HttpImageGetter.file_put_contents(WEBAPPS_DIR + path + "/" + fileName, imageSrc);

			img.attr("src", path + "/" + fileName);
		}

		return contentEl.get(0).html();
	}

	/**
	 * Determines the URL an image should be fetched from.
	 *
	 * <p>A {@code src} starting with {@code http} is used as is. Otherwise the
	 * {@code zoomfile} attribute is preferred, falling back to {@code file};
	 * the chosen value is prefixed with {@link #SITE_ROOT} unless it already
	 * starts with {@code http}.
	 */
	private static String resolveImageUrl(Element img) {
		String src = img.attr("src");
		if (src.startsWith("http")) {
			return src;
		}

		String candidate = img.attr("zoomfile");
		if (StringUtil.isEmpty(candidate)) {
			candidate = img.attr("file");
		}

		if (candidate.startsWith("http")) {
			return candidate;
		}
		return SITE_ROOT + candidate;
	}

	/**
	 * Manual smoke check: extracts a sample page from a hard-coded local path
	 * and prints the result. Not wired to any test framework.
	 */
//	@Test
	public  void fdsf() {
//		File file = new File("E:\\gather\\dixcus\\files\\thread-28993-1-391.html");
		File file = new File("E:\\gather\\dixcus\\files\\thread-1184769-1-504.html");
		String ssString = extract(file);
		System.out.println(ssString);
	}
}
